In [1]:
import pandas as pd
import numpy as np
import datetime
from matplotlib import pyplot as plt
import pickle 

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.tools import FigureFactory as ff
import plotly.graph_objs as graph

#setting
from IPython.core.interactiveshell import InteractiveShell  
InteractiveShell.ast_node_interactivity = "all"
In [4]:
df = pd.read_csv('/Turnstile_Usage_Data__2019.csv')

description of the variables can be found here: http://web.mta.info/developers/turnstile.html

In [3]:
#simple understanding of dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10467101 entries, 0 to 10467100
Data columns (total 11 columns):
C/A                                                           object
Unit                                                          object
SCP                                                           object
Station                                                       object
Line Name                                                     object
Division                                                      object
Date                                                          object
Time                                                          object
Description                                                   object
Entries                                                       int64
Exits                                                         int64
dtypes: int64(2), object(9)
memory usage: 878.4+ MB
In [5]:
#create unique turnstile id for each turnstile
df['Turnstile_ID'] = df['Unit'].astype(str) + "_"+ df['SCP'].astype(str) +"_"+ df['Station'].astype(str)
In [6]:
#clean up column name
df.rename(columns = {'Exits                                                     ':'Exits'},inplace = True)
In [7]:
#trim dataset so that it runs faster
columns_to_keep = ['Turnstile_ID','Station','Date','Time','Entries','Exits']
df_new = pd.DataFrame(df[columns_to_keep])
In [ ]:
#convert date and time into datetime format
df_new['Date'] = df_new.Date.apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y'))
df_new['Time'] = df_new.Time.apply(lambda x: datetime.datetime.strptime(x, '%H:%M:%S').time())
In [26]:
df_new.head()
Out[26]:
Turnstile_ID Station Date Time Entries Exits entry_diff_abs exit_diff_abs flag_high_entry_diff flag_high_exit_diff
0 R170_02-06-00_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 769115 559221 0.0 0.0 False False
1 R170_02-00-04_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 6483080 4945335 0.0 0.0 False False
2 R170_02-00-03_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 7191422 8417203 0.0 0.0 False False
3 R170_02-00-02_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 14983900 14554087 0.0 0.0 False False
4 R170_02-06-01_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 71047673 20925389 0.0 0.0 False False
In [9]:
quantile_to_flag = 12150 #assuming 30 pp passing through per minute
total_num_entry = len(df_new)
print(total_num_entry)
10467101
In [27]:
#calculate row wise difference in absolute value (accounting for backwards counting)
df_new['entry_diff_abs'] = df_new['Entries'].diff(periods = 1).abs()
df_new['exit_diff_abs'] = df_new['Exits'].diff(periods = 1).abs()
#reset the start of every new turnstile
df_new['entry_diff_abs'] = np.where(df_new['Turnstile_ID'] == df_new['Turnstile_ID'].shift(periods = 1,fill_value = 0),
                                df_new.entry_diff_abs,0)
df_new['exit_diff_abs'] = np.where(df_new['Turnstile_ID'] == df_new['Turnstile_ID'].shift(periods = 1,fill_value = 0),
                               df_new.exit_diff_abs,0)
In [28]:
# check for data that is abnormally high
df_new['flag_high_entry_diff'] = df_new['entry_diff_abs'] > quantile_to_flag
df_new['flag_high_exit_diff'] = df_new['exit_diff_abs'] > quantile_to_flag
In [ ]:
total_abno = np.sum([df_new.flag_high_entry_diff.sum(),df_new['flag_high_exit_diff'].sum()])
total_abno/total_num_entry*100 
#abnormal entries are less than 0.1% of total data, we will discard outliners here
In [13]:
rows_to_drop = df_new[df_new['flag_high_entry_diff']].index.tolist() + df_new[df_new['flag_high_exit_diff']].index.tolist()
In [14]:
df = pd.DataFrame(df_new.drop(rows_to_drop))
In [15]:
df = df.rename(columns = {'entry_diff_abs':'Entry_diff','exit_diff_abs':'Exit_diff'})
In [16]:
#calculate net flow as the sum of people entering station and people exiting station
df['Net_Flow'] = df['Entry_diff'] + df['Exit_diff']
In [17]:
columns_to_keep = ['Turnstile_ID','Station','Date','Time','Exit_diff','Entry_diff','Net_Flow']
df_cleaned = pd.DataFrame(df[columns_to_keep])
In [22]:
df_cleaned.head()
Out[22]:
Turnstile_ID Station Date Time Exit_diff Entry_diff Net_Flow
0 R170_02-06-00_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 0.0 0.0 0.0
1 R170_02-00-04_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 0.0 0.0 0.0
2 R170_02-00-03_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 0.0 0.0 0.0
3 R170_02-00-02_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 0.0 0.0 0.0
4 R170_02-06-01_14 ST-UNION SQ 14 ST-UNION SQ 2019-12-27 00:00:00 0.0 0.0 0.0
In [18]:
df_cleaned.to_pickle('../data/processed/data_final.pkl')
In [4]:
df = pd.read_pickle('../data/processed/data_final.pkl') #test if pickle works
df.head()
Out[4]:
Turnstile_ID Station Date Time Exit_diff Entry_diff Net_Flow
0 R001_00-00-00_WHITEHALL S-FRY WHITEHALL S-FRY 2018-12-29 00:00:00 0.0 0.0 0.0
1 R001_00-00-00_WHITEHALL S-FRY WHITEHALL S-FRY 2018-12-29 04:00:00 24.0 55.0 79.0
2 R001_00-00-00_WHITEHALL S-FRY WHITEHALL S-FRY 2018-12-29 08:00:00 35.0 163.0 198.0
3 R001_00-00-00_WHITEHALL S-FRY WHITEHALL S-FRY 2018-12-29 12:00:00 196.0 442.0 638.0
4 R001_00-00-00_WHITEHALL S-FRY WHITEHALL S-FRY 2018-12-29 16:00:00 199.0 789.0 988.0
In [4]:
#Station flow per year
station_flow_annual = df.groupby(by = 'Station')['Net_Flow'].sum()
station_flow_annual[0]
Out[4]:
10861644.0
In [5]:
station_flow_annual = df.groupby(by = 'Station')['Net_Flow'].sum()
station_flow_annual.sort_values(ascending = False,inplace = True)
print(station_flow_annual.index[:20]) #list of top 20
Index(['34 ST-PENN STA', 'GRD CNTRL-42 ST', '34 ST-HERALD SQ', '23 ST',
       'TIMES SQ-42 ST', '14 ST-UNION SQ', '42 ST-PORT AUTH', 'FULTON ST',
       '86 ST', '125 ST', 'CANAL ST', '59 ST COLUMBUS', '59 ST', '14 ST',
       '47-50 STS ROCK', '96 ST', 'FLUSHING-MAIN', 'CHAMBERS ST',
       'PATH NEW WTC', '72 ST'],
      dtype='object', name='Station')
In [143]:
top_20_crowded_stations = station_flow_annual.index[:20]
top_20_crowded_stations
Out[143]:
Index(['34 ST-PENN STA', 'GRD CNTRL-42 ST', '34 ST-HERALD SQ', '23 ST',
       'TIMES SQ-42 ST', '14 ST-UNION SQ', '42 ST-PORT AUTH', 'FULTON ST',
       '86 ST', '125 ST', 'CANAL ST', '59 ST COLUMBUS', '59 ST', '14 ST',
       '47-50 STS ROCK', '96 ST', 'FLUSHING-MAIN', 'CHAMBERS ST',
       'PATH NEW WTC', '72 ST'],
      dtype='object', name='Station')
In [5]:
#retrieve day of the week from Date
df['Date'] = pd.to_datetime(df['Date'])
df['Day'] = df['Date'].dt.dayofweek
df['Day'] +=1
In [6]:
# extract weekday or weekend from day of week
def weekday_weekend(day):
    if 1 <= day <= 5:
        return 'Weekday'
    if 6 <= day <= 7:
        return 'Weekend'
    
In [7]:
#apply function
df['Day_type'] = df.apply(lambda x: weekday_weekend(x['Day']), axis =1)
df.head(1)
Out[7]:
Turnstile_ID Station Date Time Exit_diff Entry_diff Net_Flow Day Day_type
0 R001_00-00-00_WHITEHALL S-FRY WHITEHALL S-FRY 2018-12-29 00:00:00 0.0 0.0 0.0 6 Weekend
In [8]:
#aggregate net flow based on station and day of week, and get the average traffic of each day of the week
avg_net_flow_by_day_of_week= df.groupby(['Station', 'Day'])['Net_Flow'].mean().reset_index(name='Mean_traffic_per_station_by_day_of_week')
avg_net_flow_by_day_of_week= avg_net_flow_by_day_of_week.sort_values('Mean_traffic_per_station_by_day_of_week', ascending =False)
avg_net_flow_by_day_of_week.head(5)
Out[8]:
Station Day Mean_traffic_per_station_by_day_of_week
2641 YORK ST 3 988.129450
2642 YORK ST 4 987.031385
2643 YORK ST 5 984.443844
2640 YORK ST 2 956.881410
2639 YORK ST 1 903.879085
In [9]:
# apply the weekday/weekend function on `Day`
avg_net_flow_by_day_of_week['Day_type'] = avg_net_flow_by_day_of_week.apply(lambda x: weekday_weekend(x['Day']), axis =1)
avg_net_flow_by_day_of_week.head()
Out[9]:
Station Day Mean_traffic_per_station_by_day_of_week Day_type
2641 YORK ST 3 988.129450 Weekday
2642 YORK ST 4 987.031385 Weekday
2643 YORK ST 5 984.443844 Weekday
2640 YORK ST 2 956.881410 Weekday
2639 YORK ST 1 903.879085 Weekday
In [10]:
#get the average traffic on weekend and weekday
avg_net_flow_by_day_type = avg_net_flow_by_day_of_week.groupby(['Station', 'Day_type'])['Mean_traffic_per_station_by_day_of_week'].mean().reset_index(name='Mean_traffic_per_station_by_day_type')
avg_net_flow_by_day_type.head(5)
Out[10]:
Station Day_type Mean_traffic_per_station_by_day_type
0 1 AV Weekday 578.748155
1 1 AV Weekend 257.329235
2 103 ST Weekday 509.931140
3 103 ST Weekend 326.846704
4 103 ST-CORONA Weekday 598.338452
In [11]:
#check if the output is what we intended
test = avg_net_flow_by_day_type.loc[avg_net_flow_by_day_type['Station']== 'YORK ST']
test
Out[11]:
Station Day_type Mean_traffic_per_station_by_day_type
754 YORK ST Weekday 964.073035
755 YORK ST Weekend 586.039944
In [12]:
#get the ratio of weekday and weekend 
# use this ratio as a proxy to determine what stations are filled with tourist 
# hypothesis: ratio that is close to 1 are 'touristy' stations cuz the 
# average traffic on weekdays are similar to weekends   
avg_net_flow_by_day_type['Ratio_of_traffic_per_station_by_day_type'] = avg_net_flow_by_day_type['Mean_traffic_per_station_by_day_type'].div(avg_net_flow_by_day_type.groupby('Station')['Mean_traffic_per_station_by_day_type'].shift(1))
avg_net_flow_by_day_type.head()
Out[12]:
Station Day_type Mean_traffic_per_station_by_day_type Ratio_of_traffic_per_station_by_day_type
0 1 AV Weekday 578.748155 NaN
1 1 AV Weekend 257.329235 0.444631
2 103 ST Weekday 509.931140 NaN
3 103 ST Weekend 326.846704 0.640962
4 103 ST-CORONA Weekday 598.338452 NaN
In [13]:
net_flow_ratio_histogram_distribution = px.histogram(avg_net_flow_by_day_type, x="Ratio_of_traffic_per_station_by_day_type", nbins=50,histnorm='percent')
net_flow_ratio_histogram_distribution.show()
In [91]:
avg_net_flow_by_day_type['Ratio_of_traffic_per_station_by_day_type'].describe()
#Most of the `ratio` fall below 2, and there's an outlier of 8. hence we'll restrict the range of ratio 
Out[91]:
count    379.000000
mean       0.702213
std        0.421376
min        0.203601
25%        0.590105
50%        0.683364
75%        0.751514
max        8.213386
Name: Ratio_of_traffic_per_station_by_day_type, dtype: float64
In [14]:
#restrict the range of ratio to below 2
avg_net_flow_by_day_type_remove_outlier = avg_net_flow_by_day_type.loc[avg_net_flow_by_day_type['Ratio_of_traffic_per_station_by_day_type'] <2 ]
In [95]:
avg_net_flow_by_day_type_remove_outlier.head(3)
Out[95]:
Station Day_type Mean_traffic_per_station_by_day_type Ratio_of_traffic_per_station_by_day_type
1 1 AV Weekend 257.329235 0.444631
3 103 ST Weekend 326.846704 0.640962
5 103 ST-CORONA Weekend 420.722794 0.703152
In [15]:
ratio_histogram_distribution = px.histogram(avg_net_flow_by_day_type_remove_outlier, x="Ratio_of_traffic_per_station_by_day_type", nbins=50,histnorm='percent')
ratio_histogram_distribution.show()
In [16]:
# with a ratio of between 0.9 and 1.1 are considered as 'touristy' stations
toursity_stations = avg_net_flow_by_day_type_remove_outlier.loc[((avg_net_flow_by_day_type_remove_outlier['Ratio_of_traffic_per_station_by_day_type'] >= 0.9) 
                                                                & (avg_net_flow_by_day_type_remove_outlier['Ratio_of_traffic_per_station_by_day_type'] <= 1.1))
                                                                , 'Station'].unique()
toursity_stations
len(toursity_stations)
Out[16]:
array(['69 ST', '74 ST-BROADWAY', 'BEACH 105 ST', 'BEACH 98 ST', 'BOWERY',
       'BROADWAY JCT', 'CHAUNCEY ST', 'CONEY IS-STILLW',
       'EASTN PKWY-MUSM', 'HEWES ST', 'HOWARD BCH JFK', 'JFK JAMAICA CT1',
       'LAFAYETTE AV', 'METROPOLITAN AV', 'PRINCE ST', 'ROCKAWAY PARK B',
       'SENECA AVE'], dtype=object)
Out[16]:
17
In [148]:
#these are the 20 stations of with the most traffic 
top_20_crowded_stations 
Out[148]:
Index(['34 ST-PENN STA', 'GRD CNTRL-42 ST', '34 ST-HERALD SQ', '23 ST',
       'TIMES SQ-42 ST', '14 ST-UNION SQ', '42 ST-PORT AUTH', 'FULTON ST',
       '86 ST', '125 ST', 'CANAL ST', '59 ST COLUMBUS', '59 ST', '14 ST',
       '47-50 STS ROCK', '96 ST', 'FLUSHING-MAIN', 'CHAMBERS ST',
       'PATH NEW WTC', '72 ST'],
      dtype='object', name='Station')
In [19]:
toursity_stations_set = set(toursity_stations)

top_20_crowded_stations_set = set(top_20_crowded_stations)
toursity_stations_set.intersection(top_20_crowded_stations_set)
# there are no toursity stations that fall under the top 20 crowded stations
Out[19]:
set()
In [20]:
df_w_most_locals = df.loc[df['Station'].isin(top_20_crowded_stations)]
df_w_most_locals_agg_total_traffic = df_w_most_locals.groupby(['Station','Day'])['Net_Flow'].sum().reset_index(name='Total_traffic')
df_w_most_locals_agg_total_traffic.head(5)
Out[20]:
Station Day Total_traffic
0 125 ST 1 6334111.0
1 125 ST 2 6783306.0
2 125 ST 3 6953455.0
3 125 ST 4 6854149.0
4 125 ST 5 6895993.0
In [21]:
df_w_most_locals_agg_total_traffic_copy = df_w_most_locals_agg_total_traffic.copy()
df_w_most_locals_agg_total_traffic_copy['Day'].replace({1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7: 'Sunday'}, inplace = True)
In [22]:
#top 10 stations by day of the week
line_fig = px.line(df_w_most_locals_agg_total_traffic_copy, x="Day", y="Total_traffic", color = 'Station', title='Total traffic per day of the week')
line_fig.show()
In [118]:
df_w_most_locals_agg_total_traffic_copy['Station'].unique()
Out[118]:
array(['125 ST', '14 ST', '14 ST-UNION SQ', '23 ST', '34 ST-HERALD SQ',
       '34 ST-PENN STA', '42 ST-PORT AUTH', '47-50 STS ROCK', '59 ST',
       '59 ST COLUMBUS', '72 ST', '86 ST', '96 ST', 'CANAL ST',
       'CHAMBERS ST', 'FLUSHING-MAIN', 'FULTON ST', 'GRD CNTRL-42 ST',
       'PATH NEW WTC', 'TIMES SQ-42 ST'], dtype=object)
In [23]:
# '34 ST-PENN STA', 'GRD CNTRL-42 ST', '34 ST-HERALD SQ', '23 ST' and 'TIMES SQ-42 ST' are the top 5 stations
df_5_stations_w_most_locals = df.loc[df['Station'].isin(['34 ST-PENN STA', 'GRD CNTRL-42 ST', '34 ST-HERALD SQ', '23 ST','TIMES SQ-42 ST'])]
df_5_stations_w_most_locals_agg_total_traffic = df_5_stations_w_most_locals.groupby(['Station','Day'])['Net_Flow'].sum().reset_index(name='Total_traffic')
df_5_stations_w_most_locals_agg_total_traffic_sorted = df_5_stations_w_most_locals_agg_total_traffic.sort_values(by = ['Day', 'Total_traffic'], ascending = [True,False])
df_5_stations_w_most_locals_agg_total_traffic_sorted.head(10)
Out[23]:
Station Day Total_traffic
14 34 ST-PENN STA 1 14677015.0
21 GRD CNTRL-42 ST 1 13173028.0
7 34 ST-HERALD SQ 1 10789757.0
0 23 ST 1 8966665.0
28 TIMES SQ-42 ST 1 8480698.0
15 34 ST-PENN STA 2 15832840.0
22 GRD CNTRL-42 ST 2 14623594.0
8 34 ST-HERALD SQ 2 11538064.0
1 23 ST 2 10163535.0
29 TIMES SQ-42 ST 2 9429905.0
In [24]:
df_5_stations_w_most_locals_agg_total_traffic_copy = df_5_stations_w_most_locals_agg_total_traffic_sorted.copy()
df_5_stations_w_most_locals_agg_total_traffic_copy['Day'].replace({1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7: 'Sunday'}, inplace = True)


top_5_line_fig = px.line(df_5_stations_w_most_locals_agg_total_traffic_copy, x="Day", y="Total_traffic", color = 'Station', title='Total traffic per day of the week')
top_5_line_fig.show()
In [25]:
# retrieve the hour from the `Time` variable
df_5_stations_w_most_locals['Time'] = pd.to_datetime(df_5_stations_w_most_locals['Time'], format='%H:%M:%S')
df_5_stations_w_most_locals['Hour'] = df_5_stations_w_most_locals['Time'].dt.hour
/Users/admin/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/Users/admin/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [26]:
df_5_stations_w_most_locals.head(1)
Out[26]:
Turnstile_ID Station Date Time Exit_diff Entry_diff Net_Flow Day Day_type Hour
261086 R012_00-00-00_34 ST-PENN STA 34 ST-PENN STA 2018-12-29 1900-01-01 03:00:00 0.0 0.0 0.0 6 Weekend 3
In [26]:
df_5_stations_w_most_locals_selected_hours = df_5_stations_w_most_locals.loc[df_5_stations_w_most_locals['Hour'].isin(np.arange(7,24))]
sorted(df_5_stations_w_most_locals_selected_hours['Hour'].unique())
Out[26]:
[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
In [27]:
# aggregate the timing into different time blocks 
# `Time` aggregates entry and exits on 4 hourly basis. 
# the timestamp 09:00 refers to data at 05:00

def time_block(hour):
    if 9 <= hour == 12: #in reality: 5am to 9am; morning 
        return 'Morning'
    if 13 <= hour <= 18: #in reality: 9am to 2pm; afternoon
        return 'Afternoon'
    if 19 <= hour <= 21: #in reality: 3pm to 5pm; late afternoon  
        return 'Late afternoon'
    if 22 <= hour <= 26: #in reality: 6pm to 10pm; evening 
        return 'Evening'
In [28]:
# check if the assignment of time_block is as intended 
df_5_stations_w_most_locals_selected_hours['Time_of_day'] = df_5_stations_w_most_locals_selected_hours['Hour'].apply(time_block)
df_5_stations_w_most_locals_selected_hours.head(1)
/Users/admin/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[28]:
Turnstile_ID Station Date Time Exit_diff Entry_diff Net_Flow Day Day_type Hour Time_of_day
261087 R012_00-00-00_34 ST-PENN STA 34 ST-PENN STA 2018-12-29 1900-01-01 07:00:00 22.0 1.0 23.0 6 Weekend 7 None
In [39]:
len(df_5_stations_w_most_locals_selected_hours)
Out[39]:
535272
In [29]:
#since there are more traffic on weekdays, we'll further break down 
# the traffic into the various time blocks 
df_5_stations_w_most_locals_selected_hours_weekday = df_5_stations_w_most_locals_selected_hours.loc[df_5_stations_w_most_locals_selected_hours['Day_type']== 'Weekday']
In [30]:
df_5_stations_w_most_locals_selected_hours_weekday_agg = df_5_stations_w_most_locals_selected_hours_weekday.groupby(['Station','Time_of_day'])['Net_Flow'].sum().reset_index(name='Total_traffic')
df_5_stations_w_most_locals_selected_hours_weekday_agg = df_5_stations_w_most_locals_selected_hours_weekday_agg.sort_values(by = ['Time_of_day', 'Total_traffic'], ascending = [True,False])
df_5_stations_w_most_locals_selected_hours_weekday_agg
Out[30]:
Station Time_of_day Total_traffic
8 34 ST-PENN STA Afternoon 19853114.0
12 GRD CNTRL-42 ST Afternoon 18718381.0
0 23 ST Afternoon 13548517.0
4 34 ST-HERALD SQ Afternoon 11214302.0
16 TIMES SQ-42 ST Afternoon 8535016.0
9 34 ST-PENN STA Evening 5379512.0
5 34 ST-HERALD SQ Evening 3174025.0
17 TIMES SQ-42 ST Evening 2608580.0
1 23 ST Evening 1671010.0
13 GRD CNTRL-42 ST Evening 1454898.0
14 GRD CNTRL-42 ST Late afternoon 24012198.0
10 34 ST-PENN STA Late afternoon 21158432.0
6 34 ST-HERALD SQ Late afternoon 20565146.0
2 23 ST Late afternoon 16859267.0
18 TIMES SQ-42 ST Late afternoon 16004250.0
15 GRD CNTRL-42 ST Morning 9801016.0
7 34 ST-HERALD SQ Morning 9661007.0
11 34 ST-PENN STA Morning 9626847.0
19 TIMES SQ-42 ST Morning 7695956.0
3 23 ST Morning 6905260.0
In [31]:
# hypothesis: if there are more traffic at a certain station 
# at a certain time block compared to other stations or other time block,
# this can further optimise the allocation of street teams

init_notebook_mode()

data = df_5_stations_w_most_locals_selected_hours_weekday_agg

trace = graph.Heatmap(z=data.Total_traffic,y=data.Station,x=data.Time_of_day)

plots = [trace]

layout = graph.Layout(
  yaxis=dict(
    type='category',
    categoryorder='category ascending',
    showgrid=False
    ),
  xaxis=dict(
    type='category',
    categoryorder='array',
    categoryarray=['Morning', 'Afternoon', 'Late afternoon', 'Evening'],
    showgrid=False
    )
  )

fig = graph.Figure(data=plots, layout=layout)
iplot(fig)
In [44]:
#across the 5 stations, traffic builds up steadily from morning to late afternoon. Could start with a smaller team size
#in the morning, and increase the team size in the afternoon and late afternoon
#Grand Central-42 St is the most crowded in the late afternoon. Could increase the number of street teams 
#during this period, by deploying teams from stations that are less crowded

#by evening time, traffic reduced drastically compared to morning and late afternoon. could further investigate
#if other stations are more crowded in the evening, and deploy street teams at these 5 stations to those other stations
In [9]:
total_traffic_by_day_type = df.groupby(['Station','Day_type'])['Net_Flow'].sum().reset_index(name='Total_traffic_by_day_type')
total_traffic_weekend = total_traffic_by_day_type.loc[total_traffic_by_day_type['Day_type']=='Weekend']
total_traffic_weekend_sorted = total_traffic_weekend.sort_values(by='Total_traffic_by_day_type', ascending = False)
total_traffic_weekend_sorted.head(20)
# the top 10 stations that are most crowded on weekends are also the ones that have highest traffic on weekdays.
# can deploy street teams to these 10 stations on both weekdays and weekends 
Out[9]:
Station Day_type Total_traffic_by_day_type
123 34 ST-PENN STA Weekend 17575259.0
119 34 ST-HERALD SQ Weekend 14301315.0
29 14 ST-UNION SQ Weekend 12232784.0
707 TIMES SQ-42 ST Weekend 12074752.0
137 42 ST-PORT AUTH Weekend 11864369.0
467 GRD CNTRL-42 ST Weekend 11491777.0
93 23 ST Weekend 10322934.0
19 125 ST Weekend 10075957.0
221 86 ST Weekend 10047368.0
347 CANAL ST Weekend 9382582.0
173 59 ST COLUMBUS Weekend 8370481.0
453 FULTON ST Weekend 8257781.0
27 14 ST Weekend 8150884.0
171 59 ST Weekend 7571403.0
231 96 ST Weekend 7107182.0
195 72 ST Weekend 6646462.0
251 ATL AV-BARCLAY Weekend 6492960.0
727 W 4 ST-WASH SQ Weekend 6403133.0
515 JKSN HT-ROOSVLT Weekend 6213703.0
435 FLUSHING-MAIN Weekend 5842157.0
In [171]:
highest_traffic_station_weekend = total_traffic_weekend_sorted.head(20)['Station'].tolist()
highest_traffic_station_weekend_set = set(highest_traffic_station_weekend)
common = toursity_stations_set.intersection(highest_traffic_station_weekend_set)
common
toursity_stations_set
highest_traffic_station_weekend
# there are no toursity stations that fall under the top 20 crowded stations on weekends
Out[171]:
set()
Out[171]:
{'69 ST',
 '74 ST-BROADWAY',
 'BEACH 105 ST',
 'BEACH 98 ST',
 'BOWERY',
 'BROADWAY JCT',
 'CHAUNCEY ST',
 'CONEY IS-STILLW',
 'EASTN PKWY-MUSM',
 'HEWES ST',
 'HOWARD BCH JFK',
 'JFK JAMAICA CT1',
 'LAFAYETTE AV',
 'METROPOLITAN AV',
 'PRINCE ST',
 'ROCKAWAY PARK B',
 'SENECA AVE'}
Out[171]:
['34 ST-PENN STA',
 '34 ST-HERALD SQ',
 '14 ST-UNION SQ',
 'TIMES SQ-42 ST',
 '42 ST-PORT AUTH',
 'GRD CNTRL-42 ST',
 '23 ST',
 '125 ST',
 '86 ST',
 'CANAL ST',
 '59 ST COLUMBUS',
 'FULTON ST',
 '14 ST',
 '59 ST',
 '96 ST',
 '72 ST',
 'ATL AV-BARCLAY',
 'W 4 ST-WASH SQ',
 'JKSN HT-ROOSVLT',
 'FLUSHING-MAIN']
In [173]:
top_20_crowded_stations_set.intersection(highest_traffic_station_weekend_set)
len(top_20_crowded_stations_set.intersection(highest_traffic_station_weekend_set))
#among the top 20 crowded stations, 17 of them have the highest traffic on weekends
#this reinforces the recommendation to deploy street teams to these 10 stations on both weekdays and weekends
Out[173]:
{'125 ST',
 '14 ST',
 '14 ST-UNION SQ',
 '23 ST',
 '34 ST-HERALD SQ',
 '34 ST-PENN STA',
 '42 ST-PORT AUTH',
 '59 ST',
 '59 ST COLUMBUS',
 '72 ST',
 '86 ST',
 '96 ST',
 'CANAL ST',
 'FLUSHING-MAIN',
 'FULTON ST',
 'GRD CNTRL-42 ST',
 'TIMES SQ-42 ST'}
Out[173]:
17

This is a preliminary EDA based on traffic flow using MTA data. Can further optimize street team allocation by enriching the data with

  • geographical attributes. E.g locations of tech firms, to investigate if the 5 proposed stations have tech firms that are nearby. This increases the likelihood of reaching to the target audience (people who are interested in tech) who would attend the gala event and contribute to the cause.
  • results from A/B testing, to investigate the certain profiles of people who are keen/disinterested depending on the length of pitch etc.
In [ ]: